library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotme)
df_raw = read.delim("./Data/nhwsdata.txt") #|>
DBDX 1=Diagnosed, 2=Not diagnosed, Missing = Not reporting diabetes DBTYPE 1=Type 1, 2=Type 2, 3=Unknown DBRX 1=Using an oral medication, 2=Not using an oral medication DBIN 1=Using an insulin, 2=Not using an insulin
df = df_raw |>
# deal with nominal data
mutate(DBTYPE = if_else(is.na(DBTYPE), 0, DBTYPE),
DBRX = if_else(DBRX == 1, 1, 0),
DBIN = if_else(DBIN == 1, 1, 0)) |>
mutate(across(c(DESEX, DBDX,DBTYPE,DBRX,DBIN), ~factor(.))) |>
mutate(across(c(BP, GH, MH, PF, RE, RP,VT, SF,mmas), ~factor(., ordered = TRUE)))
df_encode = df |>
mutate(DESEX = factor(DESEX, levels = c(1, 2), labels = c("Male", "Female")),
DBDX = factor(DBDX, levels = c(0, 1), labels = c("Not Diagnosed", "Diagnosed")),
DBTYPE = factor(DBTYPE, levels = c(0, 1, 2), labels = c("Not Diagnosed", "Type 1", "Type 2")),
DBRX = factor(DBRX, levels = c(0, 1), labels = c("Not Using Oral", "Using Oral")),
DBIN = factor(DBIN, levels = c(0, 1), labels = c("Not Using Insulin", "Using Insulin")))
df |> skimr:: skim()
| Name | df |
| Number of rows | 75000 |
| Number of columns | 22 |
| _______________________ | |
| Column type frequency: | |
| factor | 14 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| BP | 0 | 1.00 | TRUE | 5 | 100: 33803, 75: 24044, 50: 9809, 25: 5537 |
| DESEX | 0 | 1.00 | FALSE | 2 | 2: 38353, 1: 36647 |
| GH | 0 | 1.00 | TRUE | 5 | 85: 29988, 60: 24840, 100: 10211, 25: 8288 |
| MH | 0 | 1.00 | TRUE | 9 | 87.: 19548, 75: 16617, 62.: 11466, 50: 10728 |
| PF | 0 | 1.00 | TRUE | 5 | 100: 44167, 75: 11514, 50: 9269, 25: 5352 |
| RE | 0 | 1.00 | TRUE | 9 | 100: 40691, 75: 9330, 50: 7210, 87.: 6405 |
| RP | 0 | 1.00 | TRUE | 9 | 100: 32740, 75: 9959, 50: 8110, 87.: 8043 |
| SF | 0 | 1.00 | TRUE | 5 | 100: 42352, 75: 14258, 50: 11135, 25: 5155 |
| VT | 0 | 1.00 | TRUE | 5 | 75: 26031, 50: 25880, 25: 13262, 0: 5010 |
| DBDX | 0 | 1.00 | FALSE | 2 | 0: 66655, 1: 8345 |
| DBIN | 67720 | 0.10 | FALSE | 2 | 0: 4983, 1: 2297 |
| DBRX | 66629 | 0.11 | FALSE | 2 | 1: 7280, 0: 1091 |
| DBTYPE | 0 | 1.00 | FALSE | 3 | 0: 66655, 2: 7828, 1: 517 |
| mmas | 67720 | 0.10 | TRUE | 5 | 0: 4652, 1: 1672, 2: 705, 3: 175 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| zKey | 0 | 1.00 | 551260.41 | 74229.92 | 13334.00 | 533041.75 | 563791.50 | 582541.25 | 601291.00 | ▁▁▁▁▇ |
| BMI | 1340 | 0.98 | 28.54 | 6.99 | 10.00 | 23.73 | 27.32 | 31.90 | 89.95 | ▆▇▁▁▁ |
| DEAGE | 0 | 1.00 | 48.36 | 16.66 | 18.00 | 34.00 | 48.00 | 63.00 | 100.00 | ▇▇▇▅▁ |
| MCS | 0 | 1.00 | 48.89 | 10.66 | -0.94 | 42.18 | 51.49 | 57.16 | 76.94 | ▁▁▃▇▁ |
| PCS | 0 | 1.00 | 48.70 | 10.36 | 4.85 | 43.23 | 52.09 | 56.15 | 75.13 | ▁▁▃▇▁ |
| RUHP6Q | 0 | 1.00 | 0.11 | 0.86 | 0.00 | 0.00 | 0.00 | 0.00 | 88.00 | ▇▁▁▁▁ |
| WPACTIMP | 0 | 1.00 | 21.90 | 28.29 | 0.00 | 0.00 | 10.00 | 40.00 | 100.00 | ▇▁▂▁▁ |
| CCI | 0 | 1.00 | 0.41 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 32.00 | ▇▁▁▁▁ |
# Detecting continuous and categorical variables
continuous_vars <- c()
categorical_vars <- c()
for(col_name in names(df)) {
# Assuming continuous variables are of type numeric and have more unique values
if(is.numeric(df[[col_name]]) && length(unique(df[[col_name]])) > 10) {
continuous_vars <- c(continuous_vars, col_name)
} else {
categorical_vars <- c(categorical_vars, col_name)
}
}
continuous_vars = continuous_vars[-1]
df_encode |>
count(DBDX, DBTYPE, DBIN, DBRX#,wt = RUHP6Q
) |>
count_to_sunburst(fill_by_n = TRUE)
df |> filter(!is.na(BMI)) |> group_by(DBDX) |> summarise(mean(BMI))
## # A tibble: 2 × 2
## DBDX `mean(BMI)`
## <fct> <dbl>
## 1 0 28.0
## 2 1 33.2
df = df |>
mutate(BMI = if_else(is.na(BMI) & DBDX == 0,27.95397,
if_else(is.na(BMI) & DBDX == 1, 33.21504,BMI)))
df_encode = df_encode |>
mutate(BMI = if_else(is.na(BMI) & DBDX == 0,27.95397,
if_else(is.na(BMI) & DBDX == 1, 33.21504,BMI)))
ggplot(df_encode, aes(x = BMI,col = DBDX)) +geom_density()
## Warning: Removed 1340 rows containing non-finite values (`stat_density()`).
cat_result = tibble(
vars = categorical_vars) |>
filter(!(vars %in% c("DBRX","DBIN","DBDX","DBTYPE","mmas"))) |> # filter these out for 0 values
mutate(summary = map(vars, \(x) chisq.test(df[[x]], df[['DBDX']])),
summary = map(summary, broom::tidy ))|>
unnest(summary) |>
select(vars,statistic,p.value) |>
mutate(if_sig = if_else(p.value < 0.05,1,0))
cat_result
## # A tibble: 9 × 4
## vars statistic p.value if_sig
## <chr> <dbl> <dbl> <dbl>
## 1 BP 1987. 0 1
## 2 DESEX 668. 2.28e-147 1
## 3 GH 5372. 0 1
## 4 MH 172. 4.76e- 33 1
## 5 PF 3278. 0 1
## 6 RE 526. 1.91e-108 1
## 7 RP 2623. 0 1
## 8 SF 575. 3.18e-123 1
## 9 VT 1291. 3.06e-278 1
con_result = tibble(
vars = continuous_vars) |>
#filter(!(vars %in% c("zKey"))) |> # filter these out for 0 values
mutate(#x = df|>filter(DBDX == 0) |> select(vars))
summary = map(vars, \(x) wilcox.test(df|>filter(DBDX == 0) |> select(x) |> pull(),
df|>filter(DBDX == 1) |> select(x) |> pull())),
summary = map(summary, broom::tidy ))|>
unnest(summary) |>
select(vars,statistic,p.value) |>
mutate(if_sig = if_else(p.value < 0.05,1,0))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `summary = map(...)`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(x)
##
## # Now:
## data %>% select(all_of(x))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
con_result
## # A tibble: 7 × 4
## vars statistic p.value if_sig
## <chr> <dbl> <dbl> <dbl>
## 1 BMI 156041410. 0 1
## 2 DEAGE 163195945 0 1
## 3 MCS 276598876 4.15e- 1 0
## 4 PCS 401515408 0 1
## 5 RUHP6Q 260082172. 2.07e-113 1
## 6 WPACTIMP 203637026 0 1
## 7 CCI 43254997 0 1
corrplot::corrplot(cor(df |> select(any_of(continuous_vars))), "number")